setwd("") # script reads data from & produces output in this directory
# one regression table is generated in the working directory, 5 plots inside the plots pane (rstudio)

#________________________________________________________________libraries.
library(cjoint) # for amce estimator plots
### note: cjoint::amce() includes treatment as "respondent-varying" covariate
### and presents conjoint results by treatment group only. this approach does not give
### an ATE estimate explicitly, we thus also build our own OLS models with clustered SE to get these
library(lmtest) # clustered standard errors for OLS models
library(texreg) # generate regression tables

library(ggplot2)
library(stringr)
library(reshape2)

#________________________________________________________________set plot theme.
theme <- theme(
  legend.position = "none",
  
  panel.background = element_rect(fill = "gray90", color = "gray90"),
  strip.text = element_text(size = 8),
  strip.background = element_rect(colour = "white"),
  
  panel.grid.major = element_line(colour = "white", linetype = 1),
  panel.grid.minor = element_blank(),
  
  plot.title = element_text(face = "bold", size = 8, hjust = 0.5),
  
  axis.text.x = element_text(size = 8, margin = unit(c(t = 2.5, r = 0, b = 0, l = 0), "mm"), hjust = 0.5, vjust = 1),
  axis.title.x = element_text(size=8, hjust = 0.5),
  
  axis.text.y = element_text(size = 8, margin = unit(c(t = 0, r = 2.5, b = 0, l = 0), "mm"), hjust = 0, vjust = 0.5),
  axis.title.y = element_text(size = 8, vjust = 0.01, hjust = 0.1),
  
  plot.caption = element_text(size=8),
)




#_________________________________________________________________STUDY 2
file <- "Contact+tracing+conjoint_June+1,+2020_17.00 - Copy.csv"

# CREATE WIDE DATA SET. read data dealing with Qualtrics' "triple header"

# store first row of raw data set as vector of variable names
colnames <- read.csv(file, header = FALSE, nrows = 1, as.is = TRUE)

# read rest of data starting with row number 4
# we call this the wide data format (for consistency, but study 2 won't have a long data format like study 1)
wide <- read.csv(file, skip = 3, header = FALSE, stringsAsFactors = FALSE)

# attach variable names saved earlier to rest of data
colnames(wide) <- colnames

#_______________________________________________discard preview and screened out respondents
# discard preview (non-Prolific) respondents 
# screened-out respondents did not comply with Prolific's quotas

# note: the date string, based on which we filter preview respondents,
# may be read with different default formatting on different computers.
# ours looks like this: 06/08/2020 10:35 BST. if filtering below (row 72) produces an empty dataframe,
# then this formatting needs to be changed, e.g. "%Y-%m-%d %H:%M:%S".
# confirm formatting with head(wide$StartDate)
BST <- "%d/%m/%Y %H:%M"

# cutoff time: the survey was not live before that
cutoff <- strptime("18/05/2020 15:30:00 BST", format = BST)

# execute subsetting based on cutoff date:
wide$StartDate <- strptime(wide$StartDate, format = BST)
wide <- wide[wide$StartDate > cutoff & wide$DistributionChannel!="preview" & wide$Progress==100,]

#________________________data management

# dependent variable: the qualtrics-provided coding needs fixing
wide$human_involvement <- ifelse(
  wide$ctrace == 2, 1, ifelse(
    wide$ctrace == 3, 2, ifelse(
      wide$ctrace == 11, 3, ifelse(
        wide$ctrace == 12, 4, ifelse(
          wide$ctrace == 29, 5, ifelse(
            wide$ctrace == 13, 6, 7
          )
        )
      )
    )
  )
)

# treatment variable:
wide$treatment <- ifelse(
  wide$pre_stimulus_DO == "C1", "Control", "Treated"
)

# categorical version of 7-point DV scale
wide$human3 <- ifelse(
  wide$human_involvement %in% 1:2, "Mostly human",ifelse(
    wide$human_involvement %in% 3:5, "Mixture", "Mostly digital"
  )
)

# reorder factor levels
wide$human3 <- factor(
  wide$human3, levels = c("Mostly human","Mixture","Mostly digital")
)

# results plot
print(
  ggplot(wide, aes(x = human3, y = ..prop.., group = 1)) + 
    geom_bar(stat = "count",width = .65, fill = "gray21") +
    facet_grid(. ~ treatment) +
    xlab("") +
    ylab("Proportion") +
    ggtitle("Contact tracing done via...") +
    theme + 
    theme(
      axis.title = element_text(hjust = .5),
      axis.text.x = element_text(angle = 45, hjust = 1)
    )
)

# predicting each category of the DV
human <- glm(
  (human3 == "Mostly human") ~ treatment, wide, family = binomial(link = "logit")
)

mix <- glm(
  (human3 == "Mixture") ~ treatment, wide, family = binomial(link = "logit")
)

digi <- glm(
  (human3 == "Mostly digital") ~ treatment, wide, family = binomial(link = "logit")
)

# regression table
htmlreg(
  list(human,mix,digi),
  custom.model.names = c("Pr(y=\"Mostly human\")",
                         "Pr(y=\"Mixture\")",
                         "Pr(y=\"Mostly digital\")"),
  custom.coef.names = c("Intercept","Treated"),
  "study2.doc"
)

#____________________________________________________study 2 appendix.
wide$gender <- ifelse(
  wide$Q1 == 1, "male", ifelse(
    wide$Q1 == 2, "female", "other"
  )
)

wide$location <- ifelse(
  wide$Q3 == 1, "EE", ifelse(
    wide$Q3 == 2, "EM", ifelse(
      wide$Q3 == 3, "Lon", ifelse(
        wide$Q3 == 4, "NE", ifelse(
          wide$Q3 == 5, "NW", ifelse(
            wide$Q3 == 6, "NI", ifelse(
              wide$Q3 == 7, "Scot", ifelse(
                wide$Q3 == 8, "SE", ifelse(
                  wide$Q3 == 9, "SW", ifelse(
                    wide$Q3 == 10, "Wales", ifelse(
                      wide$Q3 == 11, "WM", "YH"
                    ))
                )
              )
            )
          )
        )
      )
    )
  )
)

wide$age <- ifelse(
  (2020 - wide$Q2) %in% 18:24, "18--24", ifelse(
    (2020 - wide$Q2) %in% 25:34, "25--34", ifelse(
      (2020 - wide$Q2) %in% 35:44, "35--44", ifelse(
        (2020 - wide$Q2) %in% 45:54, "45--54", ifelse(
          (2020 - wide$Q2) %in% 55:64, "55--64", "65+"
        )
      )
    )
  )
)

# three descriptive plots: Appendix A1 right hand side
for (varstring in c("location","gender","age")) {
  print(
    ggplot(wide, aes(x = get(varstring), y = ..prop.., group = 1)) + 
      geom_bar(stat = "count",width = .65, fill = "gray21") +
      xlab("") +
      ylab("Proportion") +
      ggtitle(paste("Study 2",varstring)) +
      theme
  )
}

# treatment compliance
summary(
  table(
    # those who recall term "theft of personal data"
    grepl("1",wide$C8),
    # those in treatment
    wide$pre_stimulus_DO
  )
)

# randomisation check
prop.table(table(wide$pre_stimulus_DO))

# Appendix C: Descriptive statistics of dependent variable
print(
  ggplot(wide, aes(x = human_involvement, y = ..prop.., group = 1)) + 
    geom_bar(stat = "count",width = .65, fill = "gray21") +
    xlab("") +
    ylab("Proportion") +
    ggtitle("Respondent's preferences regarding human involvement in contact tracing") +
    scale_x_continuous(breaks = c(1:7), labels = c("Human only",2:6,"App only")) +
    theme + 
    theme(
      axis.text.x = element_text(angle = 45,hjust = 1)
    )
)